*** This code graphs pre-pop liability over- and underestimates by AGI

cap log close
clear
set more off
set type double
set linesize 150

log using "$logdir/analysis/graph_over_underestimates_by_agi.log" , replace


local thresh 100


****************
* Prepare data *
****************

use "$statadir/soi_cdw_taxsim_2019.dta" if nonfiler==0 , clear

count

keep if agi_bin>0

quietly do "$adir/get_bad_dummies"
get_bad_dummies
*NOTE: this calculates the item-based failure indicators at the default tolerance of $100

drop bad_any

foreach xx of numlist 1 4 {
	gen double delta_tax`xx'     = taxsim`xx'_fed_incm_tax - soi_fed_incm_tax
	gen double delta_tax`xx'_pos = delta_tax`xx' if delta_tax`xx'>0
	gen double delta_tax`xx'_neg = delta_tax`xx' if delta_tax`xx'<0
	
	gen byte d_correct`thresh'_`xx' = (abs(delta_tax`xx')<`thresh')
	gen byte d_over`thresh'_`xx'    = (delta_tax`xx'>`thresh')
	gen byte d_under`thresh'_`xx'   = (delta_tax`xx'<-`thresh')
	
	gen double delta_tax_over`thresh'_`xx'  =  delta_tax`xx' if d_over`thresh'_`xx'==1
	gen double delta_tax_under`thresh'_`xx' = -delta_tax`xx' if d_under`thresh'_`xx'==1
}

gen frac_over  = d_over`thresh'_1
gen frac_under = d_under`thresh'_1

gen avg_delta_tax_over = delta_tax_over`thresh'_1
gen avg_delta_tax_under = delta_tax_under`thresh'_1

forval xx=1/20 {
	
	preserve
		keep if agi_bin==`xx' & d_over`thresh'_1==1
		keep delta_tax_over`thresh'_1 soi_wgt
		sort delta_tax_over`thresh'_1
		gen sum_wgt    = sum(soi_wgt)
		egen tot_wgt   = total(soi_wgt)
		gen percentile = sum_wgt/tot_wgt
		foreach pp of numlist 25 50 75 {
			gen dist_`pp' = abs(percentile - `=`pp'/100')
			sort dist_`pp'
			assert _N>=10
			qui sum delta_tax_over`thresh'_1 [aw=soi_wgt] if _n<=10 , meanonly
			local over_agi`xx'_p`pp' = r(mean)
		}
	restore
	
	preserve
		keep if agi_bin==`xx' & d_under`thresh'_1==1
		keep delta_tax_under`thresh'_1 soi_wgt
		sort delta_tax_under`thresh'_1
		gen sum_wgt    = sum(soi_wgt)
		egen tot_wgt   = total(soi_wgt)
		gen percentile = sum_wgt/tot_wgt
		foreach pp of numlist 25 50 75 {
			gen dist_`pp' = abs(percentile - `=`pp'/100')
			sort dist_`pp'
			assert _N>=10
			qui sum delta_tax_under`thresh'_1 [aw=soi_wgt] if _n<=10 , meanonly
			local under_agi`xx'_p`pp' = r(mean)
		}
	restore
}
*NOTE: we calculate blurred percentiles, using the weighted mean of the 10 observations closest to the actual percentile


collapse (mean) soi_agi frac* avg* [aw=soi_wgt] , by(agi_bin) fast

replace soi_agi = soi_agi/1000

niceloglabels soi_agi , style(125) local(labstr)

foreach pp of numlist 25 50 75 {
	
	gen p`pp'_delta_tax_over  = .
	gen p`pp'_delta_tax_under = .
	
	forval xx=1/20 {
		replace p`pp'_delta_tax_over  = `over_agi`xx'_p`pp''  if agi_bin==`xx'
		replace p`pp'_delta_tax_under = `under_agi`xx'_p`pp'' if agi_bin==`xx'
	}
}

gen avg_scaled_over = avg_delta_tax_over/abs(1000*soi_agi)
gen p25_scaled_over = p25_delta_tax_over/abs(1000*soi_agi)
gen p50_scaled_over = p50_delta_tax_over/abs(1000*soi_agi)
gen p75_scaled_over = p75_delta_tax_over/abs(1000*soi_agi)

gen avg_scaled_under = avg_delta_tax_under/abs(1000*soi_agi)
gen p25_scaled_under = p25_delta_tax_under/abs(1000*soi_agi)
gen p50_scaled_under = p50_delta_tax_under/abs(1000*soi_agi)
gen p75_scaled_under = p75_delta_tax_under/abs(1000*soi_agi)



******************************************************
* Graphs: Over- and underestimated liability, by AGI *
******************************************************

*** Shares with liability overestimated vs underestimated, by AGI ***

twoway (connected frac_over  soi_agi , lc(navy)   lp(dash)  msym(O) mcol(navy)  ) ///
	   (connected frac_under soi_agi , lc(ebblue) lp(solid) msym(O) mcol(ebblue)) , ///
	legend(order(1 "overestimated liability"  2 "underestimated liability")) ///
	ytitle("Share of taxpayers") ylabel(0(0.1)0.6 , format(%3.1fc) nogrid) ///
	xtitle("Adjusted gross income (000s)") graphregion(fcolor(white)) ///
	xscale(log) xlabel(`labstr')
graph export "$outdir/figures/too_low_too_high_share_by_agi.png" , width(1000) replace


*** Average dollars of liability overestimated vs underestimated (conditional on overestimated/underestimated), by AGI ***

twoway (connected avg_delta_tax_over  soi_agi , lc(navy)   lp(dash)  msym(O) mcol(navy)  ) ///
	   (connected avg_delta_tax_under soi_agi , lc(ebblue) lp(solid) msym(O) mcol(ebblue)) , ///
	legend(order(1 "overestimated liability"  2 "underestimated liability")) ///
	ytitle("Average error magnitude") ylabel( , format(%9.0fc) nogrid) ///
	xtitle("Adjusted gross income (000s)") graphregion(fcolor(white)) ///
	xscale(log) xlabel(`labstr')
graph export "$outdir/figures/too_low_too_high_avg_dollars_by_agi.png" , width(1000) replace


*** Average dollars of liability overestimated vs underestimated (conditional on overestimated/underestimated), scaled by AGI, by AGI ***

twoway (connected avg_scaled_over  soi_agi if agi_bin>1 , lc(navy)   lp(dash)  msym(O) mcol(navy)  ) ///
	   (connected avg_scaled_under soi_agi if agi_bin>1 , lc(ebblue) lp(solid) msym(O) mcol(ebblue)) , ///
	legend(order(1 "overestimated liability"  2 "underestimated liability")) ///
	ytitle("Average error magnitude / average AGI") ylabel( , format(%9.2fc) nogrid) ///
	xtitle("Adjusted gross income (000s)") graphregion(fcolor(white)) ///
	xscale(log) xlabel(`labstr')
graph export "$outdir/figures/too_low_too_high_avg_scaleddollars_by_agi.png" , width(1000) replace


*** 25th, 50th, 75th percentiles of liability overestimated (conditional on overestimated), by AGI ***

twoway (connected p25_delta_tax_over soi_agi , lc(navy)    lp(shortdash) msym(O) mcol(navy)   ) ///
	   (connected p50_delta_tax_over soi_agi , lc(edkblue) lp(dash)      msym(O) mcol(edkblue)) ///
	   (connected p75_delta_tax_over soi_agi , lc(ebblue)  lp(solid)     msym(O) mcol(ebblue) ) , ///
	legend(order(1 "25th percentile"  2 "50th percentile"  3 "75th percentile")) ///
	ytitle("Overestimated liability") ylabel( , format(%9.0fc) nogrid) ///
	xtitle("Adjusted gross income (000s)") graphregion(fcolor(white)) ///
	xscale(log) xlabel(`labstr')
graph export "$outdir/figures/too_high_p25_p50_p75_dollars_by_agi.png" , width(1000) replace


*** 25th, 50th, 75th percentiles of liability underestimated (conditional on underestimated), by AGI ***

twoway (connected p25_delta_tax_under soi_agi , lc(navy)    lp(shortdash) msym(O) mcol(navy)   ) ///
	   (connected p50_delta_tax_under soi_agi , lc(edkblue) lp(dash)      msym(O) mcol(edkblue)) ///
	   (connected p75_delta_tax_under soi_agi , lc(ebblue)  lp(solid)     msym(O) mcol(ebblue) ) , ///
	legend(order(1 "25th percentile"  2 "50th percentile"  3 "75th percentile")) ///
	ytitle("Underestimated liability") ylabel( , format(%9.0fc) nogrid) ///
	xtitle("Adjusted gross income (000s)") graphregion(fcolor(white)) ///
	xscale(log) xlabel(`labstr')
graph export "$outdir/figures/too_low_p25_p50_p75_dollars_by_agi.png" , width(1000) replace


cap log close
